Model Evaluation (mostly for regression, but with some more general points)

This notebook is going to cover a bunch of things related to model evaluation for regression, with a specific focus on linear and polynomial regression:

  1. Coefficient interpretation - one part of evaluation is looking at what our model actually learned. We're going to touch on this here, but only for linear regression.
  2. Making predictions - when we have a model we want to evaluate, we have to actually know what predictions it is making!
  3. Identifying some models to compare to - it is fun to think about how well a single model does, but often it is hard to judge a model solely on its own performance (we'll discuss why below).
  4. Evaluating models when we know the true data generating process - in the real world, we don't. But simulating cases where we do can teach us a lot! We can then look at the true bias and variance of the model!
  5. Evaluating models when we don't know the true data generating process - here, we have to rely on empirical estimates of error, but we have to be really careful when we do that. We'll introduce some main ideas here, and continue this in future lectures.

Interpreting coefficients for linear regression

One important thing we might want to know from a given linear regression model is how any given feature relates to our prediction. We can do this by analyzing the regression coefficients.

The key thing to remember is that a linear regression coefficient tells us how much our modeled outcome changes with a one-unit change in $x$.

If we have multiple predictors, i.e. ${\bf x}$, then we append to the statement above: holding all other predictors fixed.
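For concreteness, here is a minimal sketch on made-up data (hypothetical features, not anything from class) of reading these coefficients off a fitted sklearn model:

```python
# A minimal sketch on made-up data (not the in-class example): fit a linear
# regression and read off the coefficients.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
X = rng.uniform(size=(100, 2))                      # two hypothetical predictors
y = 2.0 + 3.0 * X[:, 0] - 1.0 * X[:, 1] + rng.normal(scale=0.1, size=100)

model = LinearRegression().fit(X, y)
print(model.intercept_)   # the constant term
print(model.coef_)        # each entry: change in predicted y for a one-unit
                          # change in that feature, holding the other fixed
```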

Some considerations in interpreting coefficients

Correlation is not causation

Regression models do not care what the $x$ and $y$ are. The fact that $x$ predicts $y$ is therefore not a causal statement; it would be equally valid to flip the model around and use $y$ to predict $x$. There are at least two ways to get to causation: 1) to use social theory, and 2) to use causal inference. Both come with many assumptions; we will discuss this later in the semester.

Knowing a relationship is "real"

As we discuss elsewhere in class, our model learns parameters based on training data that is simply a sample from a population. Thus, one reasonable question to ask is: how do I know if the relationship between my $x$ and my $y$, as measured by the regression coefficients/parameters, is "real"? There are many ways to answer this question. Perhaps the most obvious and well-established is to place a confidence interval on your parameter estimate using statistical inference. As always, StatQuest has a nice explanation of this. We will discuss this in more detail at some point later in class. But another approach to thinking about this for ML, where we often care mainly about prediction, is that this relationship is "real" if it is useful in making predictions. We're going to stick with that one for now.

Interpretation is based on variable scale

Your interpretation of your features depends on the scale of both $x$ and $y$. Sometimes, to make interpretation easier, you may want to rescale your input, or your outputs. Some points to remember:

Assuming you have an intercept in your model, and no interaction terms, rescaling changes your interpretation of a linear model, but not the actual fit. Here is a useful explanation of why.
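Here is a minimal sketch with made-up data (a stand-in for the cases/deaths example referenced below, not the in-class dataset) showing that rescaling changes the coefficients, and hence their interpretation, but not the fitted predictions:

```python
# Made-up data: rescaling features changes the coefficients (interpretation)
# but not the fitted predictions.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(1)
cases = rng.uniform(0, 100000, size=200)     # hypothetical feature on a large scale
deaths = rng.uniform(0, 1000, size=200)      # hypothetical feature on a small scale
y = 1.0 + 0.0001 * cases + 0.01 * deaths + rng.normal(scale=0.5, size=200)

X_raw = np.column_stack([cases, deaths])
X_scaled = np.column_stack([cases / 1000, deaths / 1000])   # both now "per 1000"

m_raw = LinearRegression().fit(X_raw, y)
m_scaled = LinearRegression().fit(X_scaled, y)

print(m_raw.coef_, m_scaled.coef_)   # coefficients differ by the scale factor
print(np.allclose(m_raw.predict(X_raw), m_scaled.predict(X_scaled)))  # True: same fit
```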

Exercise: In this example, what is more important, deaths or cases? Do our regression coefficients tell us that?

Changing the scale of your $y$ changes your interpretation as well! We discussed an example in class, and there is one on your programming assignment. There are some useful resources on the internet for this point if you want to learn more; in particular, here is one useful exploration for how to interpret linear coefficients with a logged outcome.
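For reference, here is the standard way to read a coefficient when the outcome is logged (a general result, not specific to our example). If the fitted model is

$ \log(y) = w_0 + w_1 \cdot x $, then $ y = e^{w_0} \cdot e^{w_1 x} $,

so a one-unit increase in $x$ multiplies the predicted $y$ by $e^{w_1}$ (for small $w_1$, roughly a $100 \cdot w_1$ percent change).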

Finally, we will see models where rescaling/standardizing your variables is critical for model performance. Linear regression is not one of them, but other regression and classification approaches definitely are impacted by the scale of your input variables. More on this later!

Interpretation is based on what your null model represents!

The most obvious place this comes up is in the context of using categorical variables in your regression.

Consider a regression model where we are trying to predict the amount of water you drink in a day. We have one feature, whether or not you have exercised:

OK! Let's try to fit that!

Exercise: What happened?

... OK, let's fix the issue, naively

Hmph... that worked, but... it doesn't match our true function! Note, however, that $13 - 10 = 3$, which is what we would have expected for a single coefficient.

Exercise: what happened? (don't peek!)

... note that our regression function as specified is:

$ y = w_0 + w_1 \cdot exercised + w_2 \cdot no\_exercise + \epsilon$

Now, note that: $exercised = 1 - no\_exercise$, so we can rewrite that as:

$ y = w_0 + w_1 \cdot exercised + w_2 - w_2 \cdot exercised + \epsilon $, and so:

$ y = (w_0 + w_2) + (w_1 - w_2) \cdot exercised + \epsilon$

Exercise: How many $w_1$ and $w_2$s can we find that solve this equation?

Note: the closed-form solution wouldn't even have worked here, because the matrix we need to invert ($X^TX$) is singular when one column is a linear combination of the others. But sklearn still gives us an output! Here is an explanation of why; you're not responsible for understanding that explanation, though.


OK, so now we know that we should pick either yes or no and drop that column. Here, it makes sense to drop no, although in many cases this choice is arbitrary. Having done so, our model becomes:

$ y = w_0 + w_1 \cdot exercised + \epsilon$

Exercise: What does $w_0$ now represent?
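Here is a minimal sketch of fitting this single-dummy specification, assuming (as the $13 - 10 = 3$ note above suggests) a true function like $y = 10 + 3 \cdot exercised$ plus noise; the data are simulated, not the in-class data:

```python
# Simulated data under the assumed true function y = 10 + 3 * exercised + noise.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(2)
exercised = rng.integers(0, 2, size=200)                   # single 0/1 feature
y = 10 + 3 * exercised + rng.normal(scale=0.5, size=200)   # simulated water intake

model = LinearRegression().fit(exercised.reshape(-1, 1), y)
print(model.intercept_)   # estimate of w_0 (the no-exercise group's average)
print(model.coef_)        # estimate of w_1 (the difference exercising makes)
```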


Finally, note that all of this logic extends to categorical variables with more than two categories, which we "one-hot encode" in the same way!
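As a minimal sketch (the column name and categories here are made up), this is what one-hot encoding with a dropped level looks like in pandas:

```python
# One-hot encode a categorical feature, dropping one level to avoid the
# redundancy problem above.
import pandas as pd

df = pd.DataFrame({"activity": ["run", "swim", "none", "run", "none"]})

# drop_first=True drops one category; that category becomes the baseline
# captured by the intercept.
dummies = pd.get_dummies(df["activity"], prefix="activity", drop_first=True)
print(dummies)
```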

Aside: not all transformations are linear rescaling. Exercise: What is an example of a transformation that might help us make predictions?

Making predictions with a model

To evaluate our model's predictions, we have to be able to actually make those predictions. At a high level, we make predictions by simply subbing in the $x$ value in the test data into our trained model. That is, assume we have trained a model $f_{\hat{w}}$ (using the notation in the UW course). Then to make a prediction on a test point, we evaluate $f_{\hat{w}}$ at $x$, i.e. we compute $f_{\hat{w}}(x)$.
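A minimal sketch with made-up data (the variable names here are just for illustration):

```python
# Making predictions is just evaluating the trained model f_w-hat at the
# test inputs.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(3)
X_train = rng.uniform(size=(100, 1))
y_train = 1.0 + 2.0 * X_train.ravel() + rng.normal(scale=0.1, size=100)

f_hat = LinearRegression().fit(X_train, y_train)   # the trained model f_w-hat

X_test = np.array([[0.25], [0.5], [0.75]])
print(f_hat.predict(X_test))   # f_w-hat(x) evaluated at each test point
```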

Finding some models to compare

It can in many cases be useful to evaluate a single model, but typically it is very hard to know how well you are making predictions if you look at one and only one model combined with one and only one set of features. Exercise: why?

In class, we generalized regression, following the end of Chapter 1 from Hunter Schafer's book. You are responsible for understanding this generalized equation.

At a high level, though, there are a few different things we can do to try to make different/better predictions:

  1. Change our features - we can try to collect more features, or manipulate/transform the ones we have
  2. Change our model - increase or decrease the complexity of our model
  3. Change our optimization procedure - if we have a complex surface for our loss function, doing better optimization can lead us to better parameter estimates (and thus better predictions)

Note that, as we discussed in class, 1 and 2 are tightly intertwined, in that some models (e.g. polynomial regression) implicitly "add" to our features.
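A minimal sketch of that point, on made-up data: polynomial regression is just linear regression on an expanded feature set.

```python
# Polynomial regression = feature expansion + ordinary linear regression,
# illustrating how points 1 and 2 above are intertwined.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.default_rng(4)
x = rng.uniform(-1, 1, size=(100, 1))
y = np.sin(3 * x).ravel() + rng.normal(scale=0.1, size=100)

# The "added" features: 1, x, x^2, x^3 for the first two rows
print(PolynomialFeatures(degree=3).fit_transform(x[:2]))

# A degree-3 polynomial regression fit with plain linear regression underneath
poly_model = make_pipeline(PolynomialFeatures(degree=3), LinearRegression())
poly_model.fit(x, y)
```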

Actually comparing models ... when we know the truth

!!!!!!!! Disclaimer: this is a modified version of Hunter Schafer's demo [here](https://courses.cs.washington.edu/courses/cse416/18sp/notebooks/html/bias-variance.html) !!!!!

We're going to do this "actual model comparison" in two ways. In the first, we're going to assume we know the true function. If we know the true function, we can assess the bias/variance tradeoff of our model by direct evaluation of model bias and variance!

In this section, we are going to:

  1. Define a "true function" we are going to try to learn
  2. Define functions to simulate random training datasets
  3. Learn model(s) based on (a set of) training datasets
  4. Make a bunch of predictions at various values of x
  5. Run some experiments to see the bias/variance tradeoff in action

Reminder: the high-level goal

One way to think about ML is, as we have discussed, in terms of trying to estimate (conditional probability) functions. A good way to think about this is to think about there being some true function $f$ that we want to learn.

Sadly, we don't know the functional form (linear? polynomial?) or the parameters of the model (in terms of what we have seen so far, what are the $w$s?).

Our goal in ML is to learn $f$ given only one random training set that gives us some $x$ and some $y$, where $y$ is, as noted further below, the output of $f(x)$ plus some irreducible noise.

Exercises

Where does error come from?

In lecture, we discussed how the source of error in our estimation of $f(x)$ using training data comes from three places:

  1. Irreducible noise in the data itself
  2. The bias of our model
  3. The variance of our model
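For reference, a standard way to write this decomposition of expected squared prediction error at a point $x$ (where $y = f(x) + \epsilon$ with noise variance $\sigma^2$, and $f_{\hat{w}}$ is a model fit on a random training set, using the notation introduced below) is:

$ \mathbb{E}\big[(y - f_{\hat{w}}(x))^2\big] = \underbrace{\big(f(x) - \mathbb{E}_{\hat{w}}[f_{\hat{w}}(x)]\big)^2}_{\text{bias}^2} + \underbrace{\mathbb{E}_{\hat{w}}\big[\big(f_{\hat{w}}(x) - \mathbb{E}_{\hat{w}}[f_{\hat{w}}(x)]\big)^2\big]}_{\text{variance}} + \underbrace{\sigma^2}_{\text{irreducible noise}} $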

Remember this picture for bias/variance:

There is a fundamental tradeoff between bias and variance that depends on how complex your model is. Very simple models (with what we've seen so far, models with few parameters) have high bias, since your true function is usually not constant, but have low variance, since they generally don't have the complexity to fit the noise of the specific dataset you got. Very complex models (high-degree polynomials) have low bias, since in expectation they can get a decent approximation of the true function, but have high variance, since they are able to fit the noise in the data.

This section has some code examples to demonstrate how this bias-variance tradeoff occurs with different model complexities using synthetic data.

Step 1 - Define our "true function" we're going to try to learn
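As a sketch of what such a cell might contain (this particular $f$ is made up; the function used in the original demo may differ):

```python
import numpy as np

# A made-up "true function" f for illustration.
def f(x):
    return np.sin(2 * np.pi * x)
```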

Step 2 - Define functions to simulate random training dataset

The generate_data function below will generate x values uniformly at random in [min_x, max_x] and then assign the y values using the function f plus Gaussian noise with mean 0 and standard deviation noise_sd.
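Here is a sketch of what generate_data might look like, given the description above (the original implementation may differ in its details):

```python
import numpy as np

# A possible generate_data, following the description in the text.
def generate_data(n, f, min_x=0.0, max_x=1.0, noise_sd=0.1, rng=None):
    rng = rng or np.random.default_rng()
    x = rng.uniform(min_x, max_x, size=n)        # inputs drawn uniformly at random
    y = f(x) + rng.normal(0, noise_sd, size=n)   # true function plus Gaussian noise
    return x, y

# Example usage (with the sketch f above): one possible random training set
x_train, y_train = generate_data(20, f, noise_sd=0.25)
```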

Below is an example dataset we might observe.

If you run it multiple times, you would most likely get different values. Exercise: Why?

This feels kind of unrealistic - we only have one training dataset! True. But let's say our training dataset is a set of tweets sent today, and we train a model to predict number of retweets. There's a lot of randomness in that sample, and if we sampled again tomorrow, it would likely lead to a (slightly) different model!

Step 3 - Learn model(s) based on (a set of) training datasets

The following function will learn some given number of models using random samples from our training set. Exercise: how are we getting those random samples?

This technique of approximating new datasets from the underlying distribution, using only our one dataset, is known as resampling.

We will use each random (re)sample of our training set to train a model
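A sketch of that process (fit_models is a hypothetical helper, building on the generate_data/f sketches above; the original notebook's function may differ):

```python
# Resample the one training set with replacement and fit one polynomial
# model per resample.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

def fit_models(x, y, degree, n_models, rng=None):
    rng = rng or np.random.default_rng()
    models = []
    for _ in range(n_models):
        idx = rng.integers(0, len(x), size=len(x))   # indices sampled with replacement
        model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        model.fit(x[idx].reshape(-1, 1), y[idx])
        models.append(model)
    return models

models = fit_models(x_train, y_train, degree=3, n_models=50)
```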

These models are all different, even though they're estimating the same f!!!

Exercises

Step 4 - Make a bunch of predictions at various values of x

Now, we want to think about how different forms we could specify for our guess at f lead to different levels of bias and variance. To do that, we're going to train a bunch of models, and use them all to make predictions at a bunch of values of x. Let's call each model $f_{\hat{w}}(x)$, representing the fact that they are all approximations of f based on different parameters $\hat{w}$. We're then going to plot the range of the predictions across all models to get a sense of variance, and $\mathbb{E}_{\hat{w}}[f_{\hat{w}}(x)]$ to get a sense of the bias.
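A sketch of that idea, continuing the hypothetical helpers above:

```python
# Evaluate every fitted model on a grid of x values, then summarize the
# predictions at each x.
import numpy as np

x_grid = np.linspace(0, 1, 100).reshape(-1, 1)
preds = np.array([m.predict(x_grid) for m in models])   # shape: (n_models, 100)

mean_pred = preds.mean(axis=0)   # compare to f(x_grid) to get a sense of bias
spread = preds.std(axis=0)       # a large spread at an x means high variance there
```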

Step 5 - Run some experiments!

This is the main function to run our experiments. See the cell afterwards for how to call it.

The way to read each of the graphs is: the plotted range shows the spread of predictions across the models at each x (variance), and the average prediction shows how far we are, in expectation, from the true function (bias).

Exercises

Model evaluation without access to the truth

We've already seen some of this stuff in Lecture 4 with our introduction to why we need a train/test split. But there's more to understand here!

Like last time, we're going to run through some lecture notes to build out the intuition first, and then play in the notebook and notes next week (Tuesday).